library(dplyr)
## 
## Attachement du package : 'dplyr'
## Les objets suivants sont masqués depuis 'package:stats':
## 
##     filter, lag
## Les objets suivants sont masqués depuis 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readr)
library(purrr)
knitr::opts_chunk$set(cache = FALSE, autodep = TRUE)
# List all cv files in the directory Data
csv_files <- fs::dir_ls("./RawData", regexp = "\\.csv$") # nolint: line_length_linter.

# Check if files are detected
if (length(csv_files) == 0) {
  message("No CSV files found in the directory.")
}


# Read all cv files
csv_data <- csv_files %>%
        map_dfr(~ read_csv(.x, col_types = cols()))
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## • `` -> `...1`
if (nrow(csv_data) == 0) {
  print("CSV files are read but no data is present.")
}

# Afficher les premières lignes du nouveau RawData frame pour vérification
head(csv_data)
## # A tibble: 6 × 8
##    ...1 Maternal_age Maternal_weight Maternal_height Parity Sex   
##   <dbl>        <dbl>           <dbl>           <dbl>  <dbl> <chr> 
## 1  6205         31.8            53.8             158      0 Female
## 2  5641         28.5            61               165      0 Female
## 3  2740         34.9            53.5             163      0 Female
## 4  6111         36.6            52               160      1 Male  
## 5  6327         34.3            58               156      1 Male  
## 6  5753         35.3            73               170      1 Female
## # ℹ 2 more variables: Gestational_age <dbl>, Weight <dbl>
data_d <- distinct(csv_data)
# Afficher les premières lignes du nouveau RawData frame pour vérification  
head(data_d)
## # A tibble: 6 × 8
##    ...1 Maternal_age Maternal_weight Maternal_height Parity Sex   
##   <dbl>        <dbl>           <dbl>           <dbl>  <dbl> <chr> 
## 1  6205         31.8            53.8             158      0 Female
## 2  5641         28.5            61               165      0 Female
## 3  2740         34.9            53.5             163      0 Female
## 4  6111         36.6            52               160      1 Male  
## 5  6327         34.3            58               156      1 Male  
## 6  5753         35.3            73               170      1 Female
## # ℹ 2 more variables: Gestational_age <dbl>, Weight <dbl>

Première analyse des données

df <- data_d
summary(df)
##       ...1       Maternal_age   Maternal_weight  Maternal_height
##  Min.   :   1   Min.   :14.02   Min.   :  0.00   Min.   :138.0  
##  1st Qu.:2279   1st Qu.:30.04   1st Qu.: 56.00   1st Qu.:160.0  
##  Median :4557   Median :33.16   Median : 62.00   Median :163.0  
##  Mean   :4557   Mean   :32.56   Mean   : 64.05   Mean   :163.5  
##  3rd Qu.:6835   3rd Qu.:35.74   3rd Qu.: 69.30   3rd Qu.:168.0  
##  Max.   :9113   Max.   :46.82   Max.   :135.00   Max.   :188.0  
##                                 NA's   :3        NA's   :5      
##      Parity            Sex            Gestational_age     Weight      
##  Min.   : 0.0000   Length:9113        Min.   :132.0   Min.   : 150.1  
##  1st Qu.: 0.0000   Class :character   1st Qu.:147.2   1st Qu.: 396.6  
##  Median : 0.0000   Mode  :character   Median :245.9   Median :2474.0  
##  Mean   : 0.5621                      Mean   :225.5   Mean   :2096.2  
##  3rd Qu.: 1.0000                      3rd Qu.:275.5   3rd Qu.:3100.0  
##  Max.   :10.0000                      Max.   :301.8   Max.   :4930.0  
##  NA's   :4
Nous avons 6 colonnes dans notre jeu de données, dont 5 sont numériques et 1 est catégorielle.
Il existe 10 valeurs différentes dans la colonne Parity, et les répartitions sont très inégales. Il faudra garder cela en tête lors de l’analyse des résultats du modèle.
Dans un cadre éthique, nous vérifions la répartition de chaque sexe dans notre jeu de données.
freq_sex <- table(df$Sex)
print(freq_sex)
## 
## Female   Male 
##   4556   4557
Il y a une répartition égale entre les enfants de sexe masculin et féminin, ce qui est un bon signe pour l’analyse des données.
Nous n’avons plus besoin de l’ID (noté “…1”).
if ("...1" %in% names(df)) {
  df <- subset(df, select = -1)
}
head(df)
## # A tibble: 6 × 7
##   Maternal_age Maternal_weight Maternal_height Parity Sex    Gestational_age
##          <dbl>           <dbl>           <dbl>  <dbl> <chr>            <dbl>
## 1         31.8            53.8             158      0 Female            143 
## 2         28.5            61               165      0 Female            240.
## 3         34.9            53.5             163      0 Female            247.
## 4         36.6            52               160      1 Male              284.
## 5         34.3            58               156      1 Male              246.
## 6         35.3            73               170      1 Female            148.
## # ℹ 1 more variable: Weight <dbl>
On détermine le nombre de valeurs manquantes dans chaque colonne
na_counts <- colSums(is.na(df))
print(na_counts)
##    Maternal_age Maternal_weight Maternal_height          Parity             Sex 
##               0               3               5               4               0 
## Gestational_age          Weight 
##               0               0
Au vu du faible nombre de valeurs manquantes par rapport à la taille du jeu de données, nous pouvons les supprimer sans problème. Nous vérifions ensuite qu’elles sont correctement supprimées.
df <- na.omit(df)
na_counts <- colSums(is.na(df))
print(na_counts)
##    Maternal_age Maternal_weight Maternal_height          Parity             Sex 
##               0               0               0               0               0 
## Gestational_age          Weight 
##               0               0
Vérifions s’il y a des colonnes contenant des valeurs nulles illogiques.
zero_counts <- colSums(df == 0)
print(zero_counts)
##    Maternal_age Maternal_weight Maternal_height          Parity             Sex 
##               0               2               0            4912               0 
## Gestational_age          Weight 
##               0               0
Parity est booléen, c’est donc normal d’avoir des valeurs nulles. En revanche, Maternal_weight ne devrait pas avoir de valeur nulle. Nous allons donc supprimer les lignes contenant des valeurs nulles dans cette colonne.
df <- df %>% filter(Maternal_weight != 0)
zero_counts <- colSums(df == 0)
print(zero_counts)
##    Maternal_age Maternal_weight Maternal_height          Parity             Sex 
##               0               0               0            4912               0 
## Gestational_age          Weight 
##               0               0
# Checking for duplicated lines
duplicated_rows <- df[duplicated(df), ]
print(duplicated_rows)
## # A tibble: 33 × 7
##    Maternal_age Maternal_weight Maternal_height Parity Sex    Gestational_age
##           <dbl>           <dbl>           <dbl>  <dbl> <chr>            <dbl>
##  1         24.4            63.5             160      1 Female            246.
##  2         32.1            56               156      0 Female            290.
##  3         33.2            81.8             165      0 Male              276.
##  4         27.5            60               165      0 Male              285.
##  5         29.8            77.5             169      0 Female            241.
##  6         19.5            67               158      0 Male              140 
##  7         32.7            74               164      0 Male              286.
##  8         29.8            49.5             152      0 Male              144.
##  9         31.6            59               170      1 Female            275.
## 10         19.9            54               162      0 Male              149.
## # ℹ 23 more rows
## # ℹ 1 more variable: Weight <dbl>
print(nrow(duplicated_rows))
## [1] 33
boxplot(df$Maternal_age, main = "Maternal age", ylab = "Y-axis Label")

boxplot(df$Maternal_weight, main = "Maternal weight", ylab = "Y-axis Label")

boxplot(df$Maternal_height, main = "Maternal height", ylab = "Y-axis Label")

boxplot(df$Parity, main = "Parity", ylab = "Y-axis Label")

boxplot(df$Gestational_age, main = "Gestational age", ylab = "Y-axis Label")

boxplot(df$Weight, main = "Weight", ylab = "Y-axis Label")

Concernant la parité, nous remarquons que la plupart des valeurs sont concentrées autour de 0 et 1, avec quelques valeurs aberrantes.
L’écart étant très grand, nous allons supprimer les valeurs aberrantes.
Avec seulement quelques données à la valeur élevée, il y aurait un risque que le modèle ne soit pas capable de généraliser correctement et ne sache pas comment traiter ces valeurs.
#Deleting entries with Parity greater than or equal to 5
df <- df %>% filter(Parity < 5)
boxplot(df$Parity, main = "Parity", ylab = "Y-axis Label")

Les deux dernières boites à moustache, l’une concernant le poids et l’autre l’âge gestationnel, démontrent une grande similitude. On peut donc supposer une corrélation apparente.

Nous allons maintenant visualiser cette corrélation en utilisant un nuage de points. Cela nous permet de visuellement vérifier la présence de valeur aberrante pour le poids, ce qui est compliqué autrement étant donné que celui-ci dépend de la période de gestation.

Nous tracerons ensuite des nuages de points de l’évolution dans le temps du poids selon chaque variable, prenant en compte la période de gestation.

library(ggplot2)

ggplot(df, aes(x = Gestational_age, y = Weight)) +
        geom_point() +
        ggtitle("Scatter plot of Weight vs Gestational_age") +
        xlab("Gestational_age") +
        ylab("Weight")

# Graphe de l'évolution du poids du bébé dans le temps selon l'âge de la mère
ggplot(df, aes(x = Maternal_age, y = Weight, color = Gestational_age)) +
        geom_point() +
        ggtitle("Scatter plot of Weight vs Maternal_age") +
        xlab("Maternal_age") +
        ylab("Weight")

# Graphe de l'évolution du poids du bébé dans le temps selon le poids de la mère
ggplot(df, aes(x = Maternal_weight, y = Weight, color = Gestational_age)) +
        geom_point() +
        ggtitle("Scatter plot of Weight vs Maternal_weight") +
        xlab("Maternal_weight") +
        ylab("Weight")

# Graphe de l'évolution du poids du bébé dans le temps selon la taille de la mère
ggplot(df, aes(x = Maternal_height, y = Weight, color = Gestational_age)) +
        geom_point() +
        ggtitle("Scatter plot of Weight vs Maternal_height") +
        xlab("Maternal_height") +
        ylab("Weight")

# Graphe de l'évolution du poids du bébé dans le temps selon la parité
ggplot(df, aes(x = Parity, y = Weight, color = Gestational_age)) +
        geom_point() +
        ggtitle("Scatter plot of Weight vs Parity") +
        xlab("Parity") +
        ylab("Weight")

# Graphe de l'évolution du poids du bébé dans le temps selon le genre
ggplot(df, aes(x = Sex, y = Weight, color = Gestational_age)) +
        geom_point() +
        ggtitle("Scatter plot of Weight vs Sex") +
        xlab("Sex") +
        ylab("Weight")

# Matrice de corrélation
correlation_matrix <- cor(df[, c("Maternal_age", "Maternal_weight", "Maternal_height", "Parity", "Gestational_age", "Weight")])
print(correlation_matrix)
##                 Maternal_age Maternal_weight Maternal_height       Parity
## Maternal_age     1.000000000     0.048289969      0.05299589  0.209497082
## Maternal_weight  0.048289969     1.000000000      0.33214205  0.074597879
## Maternal_height  0.052995887     0.332142053      1.00000000 -0.044621440
## Parity           0.209497082     0.074597879     -0.04462144  1.000000000
## Gestational_age  0.014648500     0.007372162      0.03292086 -0.006704868
## Weight           0.006479726     0.040356915      0.05786624  0.007200222
##                 Gestational_age      Weight
## Maternal_age        0.014648500 0.006479726
## Maternal_weight     0.007372162 0.040356915
## Maternal_height     0.032920855 0.057866241
## Parity             -0.006704868 0.007200222
## Gestational_age     1.000000000 0.973276339
## Weight              0.973276339 1.000000000
## Hypothèses La période de gestation est la variable la plus corrélée avec le poids du bébé, ce qui est logique. Néanmoins, nous avons également une corrélation significative entre le poids du bébé et le poids de la mère. Pour les autres variables, elles ont un impact sur le poids du bébé, mais il est moindre. Il n’y a pas de corrélation évidente entre les variables autres que le poids du bébé (hormis entre le poids et la taille de la mère, ce qui est logique et la corrélation n’est pas non plus si élevée).
Notre courbe prédite suivra une tendance générale selon la période de gestation, mais il y aura des variations selon les autres variables, notamment le poids de la mère.
De plus, maintenant que les valeurs aberrantes de parité sont supprimées, nous ne notons plus de valeurs aberrantes dans les autres variables.
# Préparation des données pour le modèle
## 1- Encodage des variables catégorielles
Il existe une variable catégorielle dans notre jeu de données, le sexe de l’enfant. Nous allons donc l’encoder en utilisant la fonction as.factor.
library(mltools)
library(data.table)
## 
## Attachement du package : 'data.table'
## L'objet suivant est masqué depuis 'package:purrr':
## 
##     transpose
## Les objets suivants sont masqués depuis 'package:dplyr':
## 
##     between, first, last
df$Sex <- as.factor(df$Sex)
## 2- Normalisation et standardisation
Nous allons maintenannt passer la période de gestation en semaine au lieu d’en jour.
Ensuite, nous pourrons créer une variable groupe qui sera utilisée pour un de nos modèles. Sa valeur, de 1 à 4, dépend de la période de gestation.
# Transformer Gestational_age de jours en semaines arrondies à l'entier supérieur
df <- df %>%
  mutate(Gestational_age = ceiling(Gestational_age / 7))

# Assigning a group to each line according to the gestational age
# Group 1 for gestational age : <23
# Group 2 for gestational age : 23-33
# Group 3 for gestational age : 34-36
# Group 4 for gestational age : >36
df$Group <- ifelse(df$Gestational_age < 23, 1, ifelse(df$Gestational_age < 34, 2, ifelse(df$Gestational_age < 37, 3, 4)))
df$Group <- as.factor(df$Group)

# vérifie que le fichier data.csv existe déjà
if (!file.exists("TreatedData/data.csv")) {
  write.csv(df, "TreatedData/data.csv", row.names = FALSE)
}
# Affichage d'un graphe pour visualiser la corrélation dans le temps ( gestational_age ) entre le poids et l'âge maternel

ggplot(df, aes(x = Gestational_age,y = Weight, color = Maternal_age)) +
  geom_point() +
  ggtitle("Scatter plot of Weight vs Maternal_age and Gestational_age") +
  xlab("Gestational_age") +
  ylab("Weight")

# Affichage d'un graphe pour visualiser la corrélation dans le temps ( gestational_age ) entre le poids et le poids de la mère

ggplot(df, aes(x = Gestational_age,y = Weight, color = Maternal_weight)) +
  geom_point() +
  ggtitle("Scatter plot of Weight vs Maternal_weight and Gestational_age") +
  xlab("Gestational_age") +
  ylab("Weight")

# Affichage d'un graphe pour visualiser la corrélation dans le temps ( gestational_age ) entre le poids et la taille de la mère

ggplot(df, aes(x = Gestational_age, y = Weight, color = Maternal_height)) +
  geom_point() +
  ggtitle("Scatter plot of Weight vs Maternal_height and Gestational_age") +
  xlab("Maternal_age") +
  ylab("Weight")

# Affichage d'un graphe pour visualiser la corrélation dans le temps ( gestational_age ) entre le poids et la parité

ggplot(df, aes(x = Gestational_age, y = Weight, color = Parity)) +
  geom_point() +
  ggtitle("Scatter plot of Weight vs Parity and Gestational_age") +
  xlab("Gestational_age") +
  ylab("Weight")

library(caret)
## Le chargement a nécessité le package : lattice
## 
## Attachement du package : 'caret'
## L'objet suivant est masqué depuis 'package:purrr':
## 
##     lift
set.seed(123) # for reproducibility
splitIndex <- createDataPartition(df$Weight, p = 0.8, list = FALSE)

# Create training and testing datasets
train_set <- df[splitIndex, ]
test_set <- df[-splitIndex, ]
# Cross-validation setup

train_control <- trainControl(
  method = "cv",
  number = 5
)
# Fit the model using cross-validation
model_linear <- train(
  log(Weight) ~ Maternal_age + Maternal_weight + Maternal_height + Parity + Sex + Gestational_age,
  data = train_set,
  method = "lm",
  trControl = train_control
)

# Summary of the model
summary(model_linear)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.80345 -0.13430 -0.01517  0.14210  0.60012 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     3.2272078  0.0585350  55.133  < 2e-16 ***
## Maternal_age    0.0001034  0.0004776   0.217  0.82856    
## Maternal_weight 0.0010403  0.0002002   5.196 2.09e-07 ***
## Maternal_height 0.0010509  0.0003662   2.869  0.00413 ** 
## Parity          0.0141629  0.0032939   4.300 1.73e-05 ***
## SexMale         0.0215901  0.0043303   4.986 6.31e-07 ***
## Gestational_age 0.1170591  0.0002691 434.996  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1843 on 7255 degrees of freedom
## Multiple R-squared:  0.9632, Adjusted R-squared:  0.9631 
## F-statistic: 3.162e+04 on 6 and 7255 DF,  p-value: < 2.2e-16
# Evaluate the model on the testing dataset
predictions <- exp(predict(model_linear, newdata = test_set))
test_mse <- mean((test_set$Weight - predictions)^2)
print(test_mse)
## [1] 226235.8
# Export the model to a file
saveRDS(model_linear, "Models/linear_model.rds")
# Modèle polynomial de croissance
model_p_c <- train(
  log(Weight) ~ poly(Gestational_age, 2) + Gestational_age + Maternal_age + Maternal_weight + Maternal_height + Parity + Sex + poly(Maternal_age, 2) + poly(Maternal_weight, 2) +
          poly(Maternal_height, 2) + poly(Parity, 2),
  data = train_set,
  method = "lm",
  trControl = train_control
)
# Résumé du modèle
summary(model_p_c)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.75047 -0.07586  0.00111  0.07508  0.42066 
## 
## Coefficients: (5 not defined because of singularities)
##                               Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)                  7.036e+00  3.616e-02  194.564  < 2e-16 ***
## `poly(Gestational_age, 2)1`  8.026e+01  1.132e-01  708.951  < 2e-16 ***
## `poly(Gestational_age, 2)2` -1.240e+01  1.131e-01 -109.626  < 2e-16 ***
## Gestational_age                     NA         NA       NA       NA    
## Maternal_age                -5.971e-04  2.950e-04   -2.024 0.042994 *  
## Maternal_weight              1.116e-03  1.235e-04    9.037  < 2e-16 ***
## Maternal_height              1.262e-03  2.270e-04    5.558 2.83e-08 ***
## Parity                       1.067e-02  2.025e-03    5.266 1.43e-07 ***
## SexMale                      2.268e-02  2.659e-03    8.529  < 2e-16 ***
## `poly(Maternal_age, 2)1`            NA         NA       NA       NA    
## `poly(Maternal_age, 2)2`     2.258e-01  1.135e-01    1.990 0.046655 *  
## `poly(Maternal_weight, 2)1`         NA         NA       NA       NA    
## `poly(Maternal_weight, 2)2` -3.856e-01  1.163e-01   -3.315 0.000920 ***
## `poly(Maternal_height, 2)1`         NA         NA       NA       NA    
## `poly(Maternal_height, 2)2`  3.959e-01  1.162e-01    3.408 0.000658 ***
## `poly(Parity, 2)1`                  NA         NA       NA       NA    
## `poly(Parity, 2)2`           1.974e-02  1.140e-01    0.173 0.862511    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.113 on 7250 degrees of freedom
## Multiple R-squared:  0.9862, Adjusted R-squared:  0.9861 
## F-statistic: 4.696e+04 on 11 and 7250 DF,  p-value: < 2.2e-16
# Test de Kolmogorov-Smirnov avec la distribution normale
ks.test(residuals(model_p_c), "pnorm", mean(residuals(model_p_c)), sd(residuals(model_p_c)))
## Warning in ks.test.default(residuals(model_p_c), "pnorm",
## mean(residuals(model_p_c)), : aucun ex-aequo ne devrait être présent pour le
## test de Kolmogorov-Smirnov
## 
##  Asymptotic one-sample Kolmogorov-Smirnov test
## 
## data:  residuals(model_p_c)
## D = 0.01207, p-value = 0.2406
## alternative hypothesis: two-sided
# Calcul des intervalles de confiance
final_model <- model_p_c$finalModel
model_confidence_intervals <- confint(final_model)
print(model_confidence_intervals)
##                                     2.5 %        97.5 %
## (Intercept)                  6.965035e+00  7.106813e+00
## `poly(Gestational_age, 2)1`  8.003917e+01  8.048303e+01
## `poly(Gestational_age, 2)2` -1.262230e+01 -1.217882e+01
## Gestational_age                        NA            NA
## Maternal_age                -1.175445e-03 -1.883232e-05
## Maternal_weight              8.736487e-04  1.357659e-03
## Maternal_height              8.165136e-04  1.706412e-03
## Parity                       6.696376e-03  1.463737e-02
## SexMale                      1.746517e-02  2.788969e-02
## `poly(Maternal_age, 2)1`               NA            NA
## `poly(Maternal_age, 2)2`     3.344110e-03  4.482371e-01
## `poly(Maternal_weight, 2)1`            NA            NA
## `poly(Maternal_weight, 2)2` -6.135294e-01 -1.575911e-01
## `poly(Maternal_height, 2)1`            NA            NA
## `poly(Maternal_height, 2)2`  1.681497e-01  6.235563e-01
## `poly(Parity, 2)1`                     NA            NA
## `poly(Parity, 2)2`          -2.036627e-01  2.431358e-01
predictions <- exp(predict(model_p_c, newdata = test_set))
test_mse <- mean((test_set$Weight - predictions)^2)
print(test_mse)
## [1] 73318.49
# Export the model to a file
saveRDS(model_p_c, "Models/p_c_model.rds")
# Mixed linear model
library(lme4)
## Le chargement a nécessité le package : Matrix
# Cross-validation setup
set.seed(123)
n_folds <- 5
folds <- sample(rep(1:n_folds, length.out = nrow(train_set)))

mse_results <- numeric(n_folds)

for (i in 1:n_folds) {
  test_indices <- which(folds == i)
  train_indices <- setdiff(1:nrow(train_set), test_indices)

  # Subset the data
  training_data <- train_set[train_indices, ]
  testing_data <- train_set[test_indices, ]

  # Fit the model on training data
  model_mixed <- lmer(log(Weight) ~ Maternal_age + Maternal_weight + Maternal_height +
    Parity + Sex + Gestational_age + (1 | Group), data = training_data)

  # Predict and calculate MSE on the testing fold
  predictions <- exp(predict(model_mixed, newdata = testing_data))
  mse_results[i] <- mean((testing_data$Weight - predictions)^2)
}

# Calculate the average MSE over all folds
average_mse <- mean(mse_results)
print(average_mse)
## [1] 78631.61
predictions_final <- predict(model_mixed, newdata = test_set)
final_test_mse <- mean((test_set$Weight - predictions_final)^2)
print(final_test_mse)
## [1] 5941364
# Export the mixed model to a file
saveRDS(model_mixed, "Models/mixed_linear_model.rds")
library(ggplot2)
library(dplyr)

# Create a grid of data to simulate a wider variety of input scenarios

simulated_data <- expand.grid(
        Gestational_age = seq(min(df$Gestational_age), max(df$Gestational_age), by = 0.5),
        Maternal_age = quantile(df$Maternal_age, c(0.25, 0.5, 0.75)),
        Maternal_weight = quantile(df$Maternal_weight, c(0.25, 0.5, 0.75)),
        Maternal_height = quantile(df$Maternal_height, c(0.25, 0.5, 0.75)),
        Parity = unique(df$Parity),
        Sex = factor(levels(df$Sex)),
        Group = factor(levels(df$Group))
)

# Predict weights for the entire grid for all models
simulated_data$Weight_Linear <- exp(predict(model_linear, newdata = simulated_data))
simulated_data$Weight_Poly_C <- exp(predict(model_p_c, newdata = simulated_data))
simulated_data$Weight_Mixed <- exp(predict(model_mixed, newdata = simulated_data))


# Function to calculate and plot percentiles
plot_percentiles <- function(data, weight_col, title, color) {
  data %>%
          group_by(Gestational_age) %>%
          summarise(
                  Weight = mean(!!sym(weight_col), na.rm = TRUE),
                  Percentile1 = quantile(!!sym(weight_col), 0.01, na.rm = TRUE),
                  Percentile3 = quantile(!!sym(weight_col), 0.03, na.rm = TRUE),
                  Percentile10 = quantile(!!sym(weight_col), 0.1, na.rm = TRUE),
                  Percentile90 = quantile(!!sym(weight_col), 0.9, na.rm = TRUE),
                  Percentile97 = quantile(!!sym(weight_col), 0.97, na.rm = TRUE),
                  Percentile99 = quantile(!!sym(weight_col), 0.99, na.rm = TRUE)
          ) %>%
          ggplot(aes(x = Gestational_age)) +
          geom_line(aes(y = Weight), color = color, size = 1.2) +
          geom_line(aes(y = Percentile1), color = "pink", linetype = "dashed") +
          geom_line(aes(y = Percentile3), color = "blue", linetype = "dashed") +
          geom_line(aes(y = Percentile10), color = "green", linetype = "dashed") +
          geom_line(aes(y = Percentile90), color = "orange", linetype = "dashed") +
          geom_line(aes(y = Percentile97), color = "red", linetype = "dashed") +
          geom_line(aes(y = Percentile99), color = "purple", linetype = "dashed") +
          labs(title = title, y = "Predicted Weight", x = "Gestational Age")
}

# Plot percentiles for both models
p_linear <- plot_percentiles(simulated_data, "Weight_Linear", "Linear Model: Growth Percentiles", "black")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
p_poly_c <- plot_percentiles(simulated_data, "Weight_Poly_C", "Polynomial Model: Growth Percentiles", "black")
p_mixed <- plot_percentiles(simulated_data, "Weight_Mixed", "Mixed Linear Model: Growth Percentiles", "black")
# Calculate average weights for each gestational age for all models
comparison_data <- simulated_data %>%
        group_by(Gestational_age) %>%
        summarise(
                Average_Weight_Linear = mean(Weight_Linear, na.rm = TRUE),
                Average_Weight_Poly_C = mean(Weight_Poly_C, na.rm = TRUE),
                Average_Weight_Mixed = mean(Weight_Mixed, na.rm = TRUE)
        )

# Plot graph with curves from all models
p_comparison <- ggplot(comparison_data, aes(x = Gestational_age)) +
        geom_line(aes(y = Average_Weight_Linear, colour = "Linear Model"), size = 1.2) +
        geom_line(aes(y = Average_Weight_Poly_C, colour = "Polynomial C Model"), size = 1.2) +
        geom_line(aes(y = Average_Weight_Mixed, colour = "Mixed Linear Model"), size = 1.2) +
        scale_colour_manual(values = c("Linear Model" = "blue", "Polynomial Model" = "red", "Polynomial C Model" = "black", "Mixed Linear Model" = "green")) +
        labs(title = "Comparison of Linear, Polynomials and Mixed Models Predictions",
             y = "Average Predicted Weight", x = "Gestational Age",
             colour = "Model Type")


# Print plots
print(p_linear)

print(p_poly_c)

print(p_mixed)

print(p_comparison)

# Predicted label vs actual label comparison for the linear model
comparison_data_linear <- data.frame(
        Actual = test_set$Weight,
        Predicted = exp(predict(model_linear, newdata = test_set))
)

# Predicted label vs actual label comparison for the mixed linear model
comparison_data_mixed <- data.frame(
        Actual = test_set$Weight,
        Predicted = exp(predict(model_mixed, newdata = test_set))
)

# Predicted label vs actual label comparison for the poly-c model
comparison_data_p_c <- data.frame(
        Actual = test_set$Weight,
        Predicted = exp(predict(model_p_c, newdata = test_set))
)

# Plot the comparison
p_comparison_linear <- ggplot(comparison_data_linear, aes(x = Actual, y = Predicted)) +
        geom_point() +
        geom_abline(intercept = 0, slope = 1, color = "red") +
        ggtitle("Linear Model: Predicted vs Actual Weights") +
        xlab("Actual Weight") +
        ylab("Predicted Weight")

p_comparison_mixed <- ggplot(comparison_data_mixed, aes(x = Actual, y = Predicted)) +
        geom_point() +
        geom_abline(intercept = 0, slope = 1, color = "red") +
        ggtitle("Mixed Linear Model: Predicted vs Actual Weights") +
        xlab("Actual Weight") +
        ylab("Predicted Weight")

p_comparison_p_c <- ggplot(comparison_data_p_c, aes(x = Actual, y = Predicted)) +
        geom_point() +
        geom_abline(intercept = 0, slope = 1, color = "red") +
        ggtitle("Polynomial C Model: Predicted vs Actual Weights") +
        xlab("Actual Weight") +
        ylab("Predicted Weight")

# Print plots
print(p_comparison_linear)

print(p_comparison_mixed)

print(p_comparison_p_c)

# Residuals for all models
residuals_linear <- residuals(model_linear)
residuals_mixed <- residuals(model_mixed)
residuals_p_c <- residuals(model_p_c)

# Plot residuals for all models
p_residuals_linear <- ggplot(data.frame(Residuals = residuals_linear), aes(x = Residuals)) +
        geom_histogram(bins = 30, fill = "blue") +
        ggtitle("Linear Model: Residuals Distribution") +
        xlab("Residuals") +
        ylab("Frequency")

p_residuals_mixed <- ggplot(data.frame(Residuals = residuals_mixed), aes(x = Residuals)) +
        geom_histogram(bins = 30, fill = "green") +
        ggtitle("Mixed Linear Model: Residuals Distribution") +
        xlab("Residuals") +
        ylab("Frequency")

p_residuals_p_c <- ggplot(data.frame(Residuals = residuals_p_c), aes(x = Residuals)) +
        geom_histogram(bins = 30, fill = "red") +
        ggtitle("Polynomial C Model: Residuals Distribution") +
        xlab("Residuals") +
        ylab("Frequency")

# Print plots
print(p_residuals_linear)

print(p_residuals_mixed)

print(p_residuals_p_c)

# Calculations of the R2 scores for all models
r2_linear <- cor(test_set$Weight, exp(predict(model_linear, newdata = test_set)))^2
r2_mixed <- cor(test_set$Weight, exp(predict(model_mixed, newdata = test_set)))^2
r2_p_c <- cor(test_set$Weight, exp(predict(model_p_c, newdata = test_set)))^2

print(r2_linear)
## [1] 0.8858488
print(r2_mixed)
## [1] 0.9519531
print(r2_p_c)
## [1] 0.9536838
# Calculations of the RMSE for all models
rmse_linear <- sqrt(mean((test_set$Weight - exp(predict(model_linear, newdata = test_set)))^2))
rmse_mixed <- sqrt(mean((test_set$Weight - exp(predict(model_mixed, newdata = test_set)))^2))
rmse_p_c <- sqrt(mean((test_set$Weight - exp(predict(model_p_c, newdata = test_set)))^2))

print(rmse_linear)
## [1] 475.6425
print(rmse_mixed)
## [1] 275.546
print(rmse_p_c)
## [1] 270.7739
# Calculations of the MAE for all models
mae_linear <- mean(abs(test_set$Weight - exp(predict(model_linear, newdata = test_set))))
mae_mixed <- mean(abs(test_set$Weight - exp(predict(model_mixed, newdata = test_set))))
mae_p_c <- mean(abs(test_set$Weight - exp(predict(model_p_c, newdata = test_set))))

print(mae_linear)
## [1] 338.5161
print(mae_mixed)
## [1] 186.1788
print(mae_p_c)
## [1] 183.0608
# Linear model R2 score for gestational age < 23
test_set_group1 <- test_set[test_set$Group == 1, ]
r2_linear_group1 <- cor(test_set_group1$Weight, exp(predict(model_linear, newdata = test_set_group1)))^2

# Linear model R2 score for gestational age 23-33
test_set_group2 <- test_set[test_set$Group == 2, ]
r2_linear_group2 <- cor(test_set_group2$Weight, exp(predict(model_linear, newdata = test_set_group2)))^2

# Linear model R2 score for gestational age 34-36
test_set_group3 <- test_set[test_set$Group == 3, ]
r2_linear_group3 <- cor(test_set_group3$Weight, exp(predict(model_linear, newdata = test_set_group3)))^2

# Linear model R2 score for gestational age > 36
test_set_group4 <- test_set[test_set$Group == 4, ]
r2_linear_group4 <- cor(test_set_group4$Weight, exp(predict(model_linear, newdata = test_set_group4)))^2

print(r2_linear_group1)
## [1] 0.2926218
print(r2_linear_group2)
## [1] 0.7743751
print(r2_linear_group3)
## [1] 0.1021505
print(r2_linear_group4)
## [1] 0.2874344
# Polynomial C model R2 score for gestational age < 23
r2_p_c_group1 <- cor(test_set_group1$Weight, exp(predict(model_p_c, newdata = test_set_group1)))^2

# Polynomial C model R2 score for gestational age 23-33
r2_p_c_group2 <- cor(test_set_group2$Weight, exp(predict(model_p_c, newdata = test_set_group2)))^2

# Polynomial C model R2 score for gestational age 34-36
r2_p_c_group3 <- cor(test_set_group3$Weight, exp(predict(model_p_c, newdata = test_set_group3)))^2

# Polynomial C model R2 score for gestational age > 36
r2_p_c_group4 <- cor(test_set_group4$Weight, exp(predict(model_p_c, newdata = test_set_group4)))^2

print(r2_p_c_group1)
## [1] 0.2950266
print(r2_p_c_group2)
## [1] 0.7939593
print(r2_p_c_group3)
## [1] 0.1226058
print(r2_p_c_group4)
## [1] 0.3195641
# Amount of foetuses over 90th percentile according to our polynomial model
test_set$Predicted_Weight_Poly_C <- exp(predict(model_p_c, newdata = test_set))
test_set$Predicted_Weight_Poly_C <- as.numeric(test_set$Predicted_Weight_Poly_C)
test_set$Weight <- as.numeric(test_set$Weight)

test_set$Over_90th_Percentile <- ifelse(test_set$Predicted_Weight_Poly_C > quantile(test_set$Predicted_Weight_Poly_C, 0.9), 1, 0)
over_90th_percentile <- sum(test_set$Over_90th_Percentile)
print(over_90th_percentile)
## [1] 182
# Amount of foetuses under 10th percentile according to our polynomial model
test_set$Under_10th_Percentile <- ifelse(test_set$Predicted_Weight_Poly_C < quantile(test_set$Predicted_Weight_Poly_C, 0.1), 1, 0)
under_10th_percentile <- sum(test_set$Under_10th_Percentile)
print(under_10th_percentile)
## [1] 182
# To assess the performance of the percentile growth curves, it is necessary to estimate the rate of weights that fall above or below a percentile curve.
test_set$Predicted_Weight_Poly_C <- exp(predict(model_p_c, newdata = test_set))
test_set$Predicted_Weight_Poly_C <- as.numeric(test_set$Predicted_Weight_Poly_C)
test_set$Weight <- as.numeric(test_set$Weight)

test_set$Over_90th_Percentile <- ifelse(test_set$Predicted_Weight_Poly_C > quantile(test_set$Predicted_Weight_Poly_C, 0.9), 1, 0)
over_90th_percentile <- sum(test_set$Over_90th_Percentile)


# Value in percentage
over_90th_percentile_percent <- over_90th_percentile / nrow(test_set) * 100

print(over_90th_percentile)
## [1] 182
print(over_90th_percentile_percent)
## [1] 10.03308
# Amount of foetuses under 10th percentile according to our polynomial model
test_set$Under_10th_Percentile <- ifelse(test_set$Predicted_Weight_Poly_C < quantile(test_set$Predicted_Weight_Poly_C, 0.1), 1, 0)
under_10th_percentile <- sum(test_set$Under_10th_Percentile)

# Value in percentage
under_10th_percentile_percent <- under_10th_percentile / nrow(test_set) * 100

print(under_10th_percentile)
## [1] 182
print(under_10th_percentile_percent)
## [1] 10.03308